library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(geosphere)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following object is masked from 'package:base':
## 
##     date
# Load the merged trip + weather extract.
# stringsAsFactors is set explicitly because the read.csv() default changed
# from TRUE to FALSE in R 4.0.0; the factor-style summaries printed below
# (station names, usertype, *_ATTRIBUTES) rely on text columns being factors.
masterdata <- read.csv("new_MASTER_01_data.csv", stringsAsFactors = TRUE)

summary(masterdata)
##        X           tripduration       start.station.id
##  Min.   :     1   Min.   :     61.0   519    :  1558  
##  1st Qu.: 51380   1st Qu.:    363.0   497    :  1227  
##  Median :102759   Median :    616.0   3255   :  1200  
##  Mean   :102759   Mean   :    992.7   285    :  1145  
##  3rd Qu.:154138   3rd Qu.:   1081.0   402    :  1125  
##  Max.   :205517   Max.   :2678003.0   435    :  1089  
##                                       (Other):198173  
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  1558   Min.   :40.66          Min.   :-74.03         
##  E 17 St & Broadway   :  1227   1st Qu.:40.72          1st Qu.:-74.00         
##  8 Ave & W 31 St      :  1200   Median :40.74          Median :-73.99         
##  Broadway & E 14 St   :  1145   Mean   :40.74          Mean   :-73.98         
##  Broadway & E 22 St   :  1125   3rd Qu.:40.76          3rd Qu.:-73.97         
##  W 21 St & 6 Ave      :  1089   Max.   :40.86          Max.   :-73.89         
##  (Other)              :198173                                                 
##  end.station.id                end.station.name  end.station.latitude
##  519    :  1604   Pershing Square North:  1604   Min.   :40.66       
##  497    :  1254   E 17 St & Broadway   :  1254   1st Qu.:40.72       
##  402    :  1194   Broadway & E 22 St   :  1194   Median :40.74       
##  3255   :  1169   8 Ave & W 31 St      :  1169   Mean   :40.74       
##  285    :  1157   Broadway & E 14 St   :  1157   3rd Qu.:40.76       
##  426    :  1120   West St & Chambers St:  1120   Max.   :40.86       
##  (Other):198019   (Other)              :198019                       
##  end.station.longitude     bikeid            usertype        birth.year  
##  Min.   :-74.05        Min.   :14529   Customer  : 28805   Min.   :1886  
##  1st Qu.:-74.00        1st Qu.:25323   Subscriber:176712   1st Qu.:1969  
##  Median :-73.99        Median :30947                       Median :1983  
##  Mean   :-73.98        Mean   :29669                       Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35053                       3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                       Max.   :2003  
##                                                                          
##      gender           AWND        AWND_ATTRIBUTES      PRCP      
##  Min.   :0.000   Min.   : 1.120      : 22301      Min.   :0.000  
##  1st Qu.:1.000   1st Qu.: 2.910   ,,W:183216      1st Qu.:0.000  
##  Median :1.000   Median : 4.030                   Median :0.000  
##  Mean   :1.164   Mean   : 4.385                   Mean   :0.106  
##  3rd Qu.:1.000   3rd Qu.: 5.140                   3rd Qu.:0.040  
##  Max.   :2.000   Max.   :12.750                   Max.   :1.830  
##                  NA's   :22301                                   
##   PRCP_ATTRIBUTES        SNOW        SNOW_ATTRIBUTES        SNWD        
##  ,,W,2400 :186524   Min.   :0.000            :   545   Min.   :0.00000  
##  T,,W,2400: 18993   1st Qu.:0.000   ,,W,2400 :201841   1st Qu.:0.00000  
##                     Median :0.000   T,,W,2400:  3131   Median :0.00000  
##                     Mean   :0.019                      Mean   :0.02829  
##                     3rd Qu.:0.000                      3rd Qu.:0.00000  
##                     Max.   :4.000                      Max.   :3.90000  
##                     NA's   :545                                         
##   SNWD_ATTRIBUTES     TAVG         TAVG_ATTRIBUTES      TMAX      
##  ,,W,2400 :204127   Mode:logical   Mode:logical    Min.   :14.00  
##  T,,W,2400:  1390   NA's:205517    NA's:205517     1st Qu.:57.00  
##                                                    Median :71.00  
##                                                    Mean   :68.17  
##                                                    3rd Qu.:81.00  
##                                                    Max.   :95.00  
##                                                                   
##  TMAX_ATTRIBUTES      TMIN       TMIN_ATTRIBUTES      WDF2      
##  ,,W:205517      Min.   : 2.00   ,,W:205517      Min.   : 10.0  
##                  1st Qu.:42.00                   1st Qu.: 60.0  
##                  Median :56.00                   Median :220.0  
##                  Mean   :53.63                   Mean   :182.2  
##                  3rd Qu.:67.00                   3rd Qu.:280.0  
##                  Max.   :82.00                   Max.   :360.0  
##                                                  NA's   :22301  
##  WDF2_ATTRIBUTES      WDF5       WDF5_ATTRIBUTES      WSF2      
##     : 22301      Min.   : 10.0      : 22700      Min.   : 6.90  
##  ,,W:183216      1st Qu.: 70.0   ,,W:182817      1st Qu.:10.10  
##                  Median :220.0                   Median :12.10  
##                  Mean   :183.6                   Mean   :12.81  
##                  3rd Qu.:270.0                   3rd Qu.:15.00  
##                  Max.   :360.0                   Max.   :25.10  
##                  NA's   :22700                   NA's   :22301  
##  WSF2_ATTRIBUTES      WSF5       WSF5_ATTRIBUTES      WT01       
##     : 22301      Min.   :11.00      : 22700      Min.   :1       
##  ,,W:183216      1st Qu.:17.00   ,,W:182817      1st Qu.:1       
##                  Median :19.90                   Median :1       
##                  Mean   :20.77                   Mean   :1       
##                  3rd Qu.:23.00                   3rd Qu.:1       
##                  Max.   :40.90                   Max.   :1       
##                  NA's   :22700                   NA's   :123167  
##  WT01_ATTRIBUTES      WT02        WT02_ATTRIBUTES      WT03       
##     :123167      Min.   :1           :201690      Min.   :1       
##  ,,W: 82350      1st Qu.:1        ,,W:  3827      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :201690                   NA's   :186419  
##  WT03_ATTRIBUTES      WT06        WT06_ATTRIBUTES      WT08       
##     :186419      Min.   :1           :204101      Min.   :1       
##  ,,W: 19098      1st Qu.:1        ,,W:  1416      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :204101                   NA's   :172801  
##  WT08_ATTRIBUTES                   newStartTime   
##     :172801      2019-03-01 17:41:27.7210:     2  
##  ,,W: 32716      2019-07-31 17:48:23.5580:     2  
##                  2019-01-01 00:35:03.5980:     1  
##                  2019-01-01 01:14:01.5150:     1  
##                  2019-01-01 01:59:10.1080:     1  
##                  2019-01-01 02:47:03.7040:     1  
##                  (Other)                 :205509  
##                    newStopTime    
##  2019-05-28 09:10:01.3380:     2  
##  2019-07-12 08:43:08.3900:     2  
##  2019-01-01 00:38:10.6250:     1  
##  2019-01-01 01:58:41.1290:     1  
##  2019-01-01 02:12:34.9820:     1  
##  2019-01-01 02:55:16.4380:     1  
##  (Other)                 :205509
# Inspect column names and storage types of the freshly loaded data.
str(masterdata)
## 'data.frame':    205517 obs. of  48 variables:
##  $ X                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ tripduration           : int  110 1067 325 552 282 1150 178 777 423 144 ...
##  $ start.station.id       : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
##  $ start.station.name     : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
##  $ start.station.latitude : num  40.8 40.8 40.8 40.7 40.7 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
##  $ end.station.name       : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
##  $ end.station.latitude   : num  40.8 40.7 40.8 40.7 40.7 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : int  38891 38269 14654 15101 32868 30584 32492 30258 36783 36111 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
##  $ birth.year             : int  1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
##  $ gender                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ AWND                   : num  3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
##  $ AWND_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ PRCP                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PRCP_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
##  $ SNOW                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOW_ATTRIBUTES        : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SNWD                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TAVG                   : logi  NA NA NA NA NA NA ...
##  $ TAVG_ATTRIBUTES        : logi  NA NA NA NA NA NA ...
##  $ TMAX                   : int  87 39 70 87 85 85 80 88 49 60 ...
##  $ TMAX_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TMIN                   : int  73 32 52 75 72 68 63 75 33 38 ...
##  $ TMIN_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WDF2                   : int  70 250 40 60 220 290 150 140 10 260 ...
##  $ WDF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WDF5                   : int  40 220 40 70 220 290 150 140 360 260 ...
##  $ WDF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF2                   : num  8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
##  $ WSF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF5                   : num  17 19.9 13 13 19 15 16.1 15 25.1 23 ...
##  $ WSF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WT01                   : int  1 NA NA NA NA NA NA NA NA NA ...
##  $ WT01_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
##  $ WT02                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT02_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT03                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT03_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT06                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT06_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT08                   : int  NA 1 NA NA NA NA 1 NA NA NA ...
##  $ WT08_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
##  $ newStartTime           : Factor w/ 205515 levels "2019-01-01 00:35:03.5980",..: 124947 197101 150074 92205 108911 100289 70041 126789 29559 28200 ...
##  $ newStopTime            : Factor w/ 205515 levels "2019-01-01 00:38:10.6250",..: 124930 197107 150056 92205 108899 100292 70035 126787 29556 28200 ...
# ---- Type conversions ----

# bikeid is an identifier, not a quantity, so treat it as categorical.
masterdata$bikeid <- as.factor(masterdata$bikeid)

# Recode the integer gender codes (0 / 1 / 2) into labels in a single step.
# The level order (Female, Male, Unknown) is the same alphabetical order the
# data had before. A code outside 0-2 now becomes NA instead of being
# silently labelled "Female".
masterdata$gender <- factor(
  masterdata$gender,
  levels = c(2, 1, 0),
  labels = c("Female", "Male", "Unknown")
)

# Drop columns that are not needed for the analysis. Assigning NULL to a
# column that is absent is a no-op, so the starttime/stoptime lines are safe
# even when the CSV does not carry them.
masterdata$X <- NULL
masterdata$starttime <- NULL
masterdata$stoptime <- NULL

# ---- Timestamps and dates ----

# Parse the timestamp text in the session's local time zone. Anything after
# whole seconds (the fractional part) is ignored by "%S".
timestamp_format <- "%Y-%m-%d %H:%M:%S"
masterdata$newStartTime <- as.POSIXct(
  strptime(masterdata$newStartTime, timestamp_format)
)
masterdata$newStopTime <- as.POSIXct(
  strptime(masterdata$newStopTime, timestamp_format)
)

# as.Date() on POSIXct defaults to tz = "UTC", which pushed local evening
# trips onto the following calendar day (e.g. a 19:06 stop on 2019-12-04 was
# dated 2019-12-05). tz = "" converts in the same local zone used for parsing,
# so the date always matches the date written in the timestamp.
masterdata$newStartDate <- as.Date(masterdata$newStartTime, tz = "")
masterdata$newStopDate <- as.Date(masterdata$newStopTime, tz = "")

# ---- Distance ----

# Great-circle distance between start and end stations. distHaversine()
# expects longitude first, then latitude; with r given in metres the result
# is in metres. The frames are built straight from the columns, so no row
# count needs to be hard-coded.
start_points <- data.frame(
  startlong = masterdata$start.station.longitude,
  startlat = masterdata$start.station.latitude
)
end_points <- data.frame(
  endlong = masterdata$end.station.longitude,
  endlat = masterdata$end.station.latitude
)
masterdata$distanceH <- distHaversine(start_points, end_points, r = 6378137)
rm(start_points, end_points)

# ---- Speed ----

# tripduration is in seconds (it matches newStopTime - newStartTime), so it is
# converted to minutes first; the column really is metres per minute, as its
# name says. Previously the division by seconds produced metres per second.
masterdata$speedMetersperMin <-
  masterdata$distanceH / (masterdata$tripduration / 60)

# ---- Age classification ----

# Bin birth years into generations. right = FALSE makes each band
# [lower, upper), i.e. the same ">= lower bound" rule as before:
#   < 1928 VeryOld, 1928-1945 Silent, 1946-1964 Boomer,
#   1965-1980 GenX, 1981-1999 Millennial, >= 2000 GenZ.
generation_breaks <- c(-Inf, 1928, 1946, 1965, 1981, 2000, Inf)
generation_labels <- c(
  "VeryOld", "Silent", "Boomer", "GenX", "Millennial", "GenZ"
)
masterdata$agegroup <- cut(
  masterdata$birth.year,
  breaks = generation_breaks,
  labels = generation_labels,
  right = FALSE
)

# Order factor levels youngest-to-oldest for visualizations.
masterdata$agegroup <- factor(
  masterdata$agegroup,
  levels = rev(generation_labels)
)

# ---- Month columns for visualizations ----

# Numeric month (1-12) plus a factor copy for use as a discrete axis/group.
masterdata$startMonth <- month(masterdata$newStartDate)
masterdata$stopMonth <- month(masterdata$newStopDate)
masterdata$startMonthFactor <- as.factor(masterdata$startMonth)
masterdata$stopMonthFactor <- as.factor(masterdata$stopMonth)

str(masterdata)
## 'data.frame':    205517 obs. of  56 variables:
##  $ tripduration           : int  110 1067 325 552 282 1150 178 777 423 144 ...
##  $ start.station.id       : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
##  $ start.station.name     : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
##  $ start.station.latitude : num  40.8 40.8 40.8 40.7 40.7 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
##  $ end.station.name       : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
##  $ end.station.latitude   : num  40.8 40.7 40.8 40.7 40.7 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : Factor w/ 19094 levels "14529","14530",..: 16053 15465 97 422 11882 9967 11539 9673 14577 14187 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
##  $ birth.year             : int  1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
##  $ gender                 : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ AWND                   : num  3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
##  $ AWND_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ PRCP                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PRCP_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
##  $ SNOW                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOW_ATTRIBUTES        : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SNWD                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TAVG                   : logi  NA NA NA NA NA NA ...
##  $ TAVG_ATTRIBUTES        : logi  NA NA NA NA NA NA ...
##  $ TMAX                   : int  87 39 70 87 85 85 80 88 49 60 ...
##  $ TMAX_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TMIN                   : int  73 32 52 75 72 68 63 75 33 38 ...
##  $ TMIN_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WDF2                   : int  70 250 40 60 220 290 150 140 10 260 ...
##  $ WDF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WDF5                   : int  40 220 40 70 220 290 150 140 360 260 ...
##  $ WDF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF2                   : num  8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
##  $ WSF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF5                   : num  17 19.9 13 13 19 15 16.1 15 25.1 23 ...
##  $ WSF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WT01                   : int  1 NA NA NA NA NA NA NA NA NA ...
##  $ WT01_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
##  $ WT02                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT02_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT03                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT03_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT06                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT06_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT08                   : int  NA 1 NA NA NA NA 1 NA NA NA ...
##  $ WT08_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
##  $ newStartTime           : POSIXct, format: "2019-08-17 13:10:36" "2019-12-04 18:48:45" ...
##  $ newStopTime            : POSIXct, format: "2019-08-17 13:12:27" "2019-12-04 19:06:32" ...
##  $ newStartDate           : Date, format: "2019-08-17" "2019-12-04" ...
##  $ newStopDate            : Date, format: "2019-08-17" "2019-12-05" ...
##  $ distanceH              : num  413 2567 922 657 1099 ...
##  $ speedMetersperMin      : num  3.75 2.41 2.84 1.19 3.9 ...
##  $ agegroup               : Factor w/ 6 levels "GenZ","Millennial",..: 2 3 2 3 2 2 4 2 3 4 ...
##  $ startMonth             : num  8 12 9 7 7 7 6 8 3 3 ...
##  $ stopMonth              : num  8 12 9 7 7 7 6 8 3 3 ...
##  $ startMonthFactor       : Factor w/ 12 levels "1","2","3","4",..: 8 12 9 7 7 7 6 8 3 3 ...
##  $ stopMonthFactor        : Factor w/ 12 levels "1","2","3","4",..: 8 12 9 7 7 7 6 8 3 3 ...
# Summarise every column after the cleaning and derived-column steps above.
summary(masterdata)
##   tripduration       start.station.id             start.station.name
##  Min.   :     61.0   519    :  1558   Pershing Square North:  1558  
##  1st Qu.:    363.0   497    :  1227   E 17 St & Broadway   :  1227  
##  Median :    616.0   3255   :  1200   8 Ave & W 31 St      :  1200  
##  Mean   :    992.7   285    :  1145   Broadway & E 14 St   :  1145  
##  3rd Qu.:   1081.0   402    :  1125   Broadway & E 22 St   :  1125  
##  Max.   :2678003.0   435    :  1089   W 21 St & 6 Ave      :  1089  
##                      (Other):198173   (Other)              :198173  
##  start.station.latitude start.station.longitude end.station.id  
##  Min.   :40.66          Min.   :-74.03          519    :  1604  
##  1st Qu.:40.72          1st Qu.:-74.00          497    :  1254  
##  Median :40.74          Median :-73.99          402    :  1194  
##  Mean   :40.74          Mean   :-73.98          3255   :  1169  
##  3rd Qu.:40.76          3rd Qu.:-73.97          285    :  1157  
##  Max.   :40.86          Max.   :-73.89          426    :  1120  
##                                                 (Other):198019  
##               end.station.name  end.station.latitude end.station.longitude
##  Pershing Square North:  1604   Min.   :40.66        Min.   :-74.05       
##  E 17 St & Broadway   :  1254   1st Qu.:40.72        1st Qu.:-74.00       
##  Broadway & E 22 St   :  1194   Median :40.74        Median :-73.99       
##  8 Ave & W 31 St      :  1169   Mean   :40.74        Mean   :-73.98       
##  Broadway & E 14 St   :  1157   3rd Qu.:40.76        3rd Qu.:-73.97       
##  West St & Chambers St:  1120   Max.   :40.86        Max.   :-73.89       
##  (Other)              :198019                                             
##      bikeid             usertype        birth.year       gender      
##  35306  :    44   Customer  : 28805   Min.   :1886   Female : 49419  
##  34019  :    41   Subscriber:176712   1st Qu.:1969   Male   :140370  
##  34958  :    41                       Median :1983   Unknown: 15728  
##  35029  :    41                       Mean   :1980                   
##  35324  :    41                       3rd Qu.:1990                   
##  33885  :    40                       Max.   :2003                   
##  (Other):205269                                                      
##       AWND        AWND_ATTRIBUTES      PRCP        PRCP_ATTRIBUTES  
##  Min.   : 1.120      : 22301      Min.   :0.000   ,,W,2400 :186524  
##  1st Qu.: 2.910   ,,W:183216      1st Qu.:0.000   T,,W,2400: 18993  
##  Median : 4.030                   Median :0.000                     
##  Mean   : 4.385                   Mean   :0.106                     
##  3rd Qu.: 5.140                   3rd Qu.:0.040                     
##  Max.   :12.750                   Max.   :1.830                     
##  NA's   :22301                                                      
##       SNOW        SNOW_ATTRIBUTES        SNWD          SNWD_ATTRIBUTES  
##  Min.   :0.000            :   545   Min.   :0.00000   ,,W,2400 :204127  
##  1st Qu.:0.000   ,,W,2400 :201841   1st Qu.:0.00000   T,,W,2400:  1390  
##  Median :0.000   T,,W,2400:  3131   Median :0.00000                     
##  Mean   :0.019                      Mean   :0.02829                     
##  3rd Qu.:0.000                      3rd Qu.:0.00000                     
##  Max.   :4.000                      Max.   :3.90000                     
##  NA's   :545                                                            
##    TAVG         TAVG_ATTRIBUTES      TMAX       TMAX_ATTRIBUTES      TMIN      
##  Mode:logical   Mode:logical    Min.   :14.00   ,,W:205517      Min.   : 2.00  
##  NA's:205517    NA's:205517     1st Qu.:57.00                   1st Qu.:42.00  
##                                 Median :71.00                   Median :56.00  
##                                 Mean   :68.17                   Mean   :53.63  
##                                 3rd Qu.:81.00                   3rd Qu.:67.00  
##                                 Max.   :95.00                   Max.   :82.00  
##                                                                                
##  TMIN_ATTRIBUTES      WDF2       WDF2_ATTRIBUTES      WDF5      
##  ,,W:205517      Min.   : 10.0      : 22301      Min.   : 10.0  
##                  1st Qu.: 60.0   ,,W:183216      1st Qu.: 70.0  
##                  Median :220.0                   Median :220.0  
##                  Mean   :182.2                   Mean   :183.6  
##                  3rd Qu.:280.0                   3rd Qu.:270.0  
##                  Max.   :360.0                   Max.   :360.0  
##                  NA's   :22301                   NA's   :22700  
##  WDF5_ATTRIBUTES      WSF2       WSF2_ATTRIBUTES      WSF5      
##     : 22700      Min.   : 6.90      : 22301      Min.   :11.00  
##  ,,W:182817      1st Qu.:10.10   ,,W:183216      1st Qu.:17.00  
##                  Median :12.10                   Median :19.90  
##                  Mean   :12.81                   Mean   :20.77  
##                  3rd Qu.:15.00                   3rd Qu.:23.00  
##                  Max.   :25.10                   Max.   :40.90  
##                  NA's   :22301                   NA's   :22700  
##  WSF5_ATTRIBUTES      WT01        WT01_ATTRIBUTES      WT02       
##     : 22700      Min.   :1           :123167      Min.   :1       
##  ,,W:182817      1st Qu.:1        ,,W: 82350      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :123167                   NA's   :201690  
##  WT02_ATTRIBUTES      WT03        WT03_ATTRIBUTES      WT06       
##     :201690      Min.   :1           :186419      Min.   :1       
##  ,,W:  3827      1st Qu.:1        ,,W: 19098      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :186419                   NA's   :204101  
##  WT06_ATTRIBUTES      WT08        WT08_ATTRIBUTES  newStartTime                
##     :204101      Min.   :1           :172801      Min.   :2019-01-01 00:35:03  
##  ,,W:  1416      1st Qu.:1        ,,W: 32716      1st Qu.:2019-05-03 06:30:28  
##                  Median :1                        Median :2019-07-18 16:48:45  
##                  Mean   :1                        Mean   :2019-07-12 13:31:09  
##                  3rd Qu.:1                        3rd Qu.:2019-09-23 18:04:58  
##                  Max.   :1                        Max.   :2019-12-31 23:33:21  
##                  NA's   :172801                                                
##   newStopTime                   newStartDate         newStopDate        
##  Min.   :2019-01-01 00:38:10   Min.   :2019-01-01   Min.   :2019-01-01  
##  1st Qu.:2019-05-03 06:49:10   1st Qu.:2019-05-03   1st Qu.:2019-05-03  
##  Median :2019-07-18 16:59:56   Median :2019-07-18   Median :2019-07-18  
##  Mean   :2019-07-12 13:47:42   Mean   :2019-07-12   Mean   :2019-07-12  
##  3rd Qu.:2019-09-23 18:18:30   3rd Qu.:2019-09-23   3rd Qu.:2019-09-23  
##  Max.   :2020-01-02 09:26:42   Max.   :2020-01-01   Max.   :2020-01-02  
##                                                                         
##    distanceH       speedMetersperMin       agegroup        startMonth   
##  Min.   :    0.0   Min.   :0.000     GenZ      :  2173   Min.   : 1.00  
##  1st Qu.:  825.9   1st Qu.:1.946     Millennial:112458   1st Qu.: 5.00  
##  Median : 1375.9   Median :2.504     GenX      : 67976   Median : 7.00  
##  Mean   : 1779.9   Mean   :2.441     Boomer    : 22129   Mean   : 6.86  
##  3rd Qu.: 2305.2   3rd Qu.:3.033     Silent    :   701   3rd Qu.: 9.00  
##  Max.   :13812.2   Max.   :8.356     VeryOld   :    80   Max.   :12.00  
##                                                                         
##    stopMonth     startMonthFactor stopMonthFactor
##  Min.   : 1.00   9      :24692    9      :24685  
##  1st Qu.: 5.00   8      :23606    8      :23609  
##  Median : 7.00   7      :21700    7      :21698  
##  Mean   : 6.86   10     :21089    10     :21096  
##  3rd Qu.: 9.00   6      :21064    6      :21069  
##  Max.   :12.00   5      :19218    5      :19209  
##                  (Other):74148    (Other):74151
# Trip duration across the year for every trip, coloured by gender.
ggplot(masterdata, aes(newStartDate, tripduration, colour = gender)) +
  geom_point()

# The remaining plots use only trips with tripduration < 10000.
date_plot_trips <- masterdata[masterdata$tripduration < 10000, ]

# newStartDate vs tripduration by gender
date_duration_gender <- ggplot(
  date_plot_trips,
  aes(newStartDate, tripduration, colour = gender)
)
date_duration_gender + geom_point()

date_duration_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

date_duration_gender + geom_violin()
## Warning: position_dodge requires non-overlapping x intervals

date_duration_gender + geom_boxplot()

# newStartDate vs tripduration by usertype
date_duration_usertype <- ggplot(
  date_plot_trips,
  aes(newStartDate, tripduration, colour = usertype)
)
date_duration_usertype + geom_point()

date_duration_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

date_duration_usertype + geom_violin()

date_duration_usertype + geom_boxplot()

As one might expect, trip duration increases during warmer months and decreases as temperatures drop; this suggests that taking longer trips is either more necessary or more enjoyable in warmer months. Females on average have longer trips than males. Riders of unknown gender have the highest trip durations, and customers have higher trip durations than subscribers. Perhaps customers are not required to reveal their gender, and perhaps these customers differ from subscribers in ways beyond account status that affect their trip duration. Citibike managers should keep in mind that any system overhauls, construction, or repairs should be scheduled in a month with less demand so the company does not miss out on revenue from peak times.

# Count the trips in each 0.5-wide PRCP band (the last band is open-ended).
prcp <- masterdata$PRCP
nrow(masterdata[prcp < 0.5, ])
## [1] 187806
nrow(masterdata[prcp >= 0.5 & prcp < 1, ])
## [1] 13168
nrow(masterdata[prcp >= 1 & prcp < 1.5, ])
## [1] 2712
nrow(masterdata[prcp >= 1.5, ])
## [1] 1831

These numbers will guide the analysis below: while the averages on the y-axis may suggest certain insights, looking at the confidence intervals across the various PRCP ranges will be useful in drawing meaningful conclusions. As these counts indicate, the number of rides falls sharply as PRCP increases, which suggests that riders in higher PRCP may not be representative of the typical Citibike rider.

# All plots in this section use only trips with tripduration < 10000.
prcp_duration_trips <- masterdata[masterdata$tripduration < 10000, ]

# PRCP vs tripduration by gender
prcp_duration_gender <- ggplot(
  prcp_duration_trips,
  aes(PRCP, tripduration, colour = gender)
)
prcp_duration_gender + geom_point()

prcp_duration_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_duration_gender + geom_violin()

prcp_duration_gender + geom_boxplot()

# PRCP vs tripduration by usertype
prcp_duration_usertype <- ggplot(
  prcp_duration_trips,
  aes(PRCP, tripduration, colour = usertype)
)
prcp_duration_usertype + geom_point()

prcp_duration_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_duration_usertype + geom_violin()

prcp_duration_usertype + geom_boxplot()

As precipitation increases, trip duration decreases. Females again have higher average trip duration, but their relationship to precipitation is not consistent across the PRCP range. Perhaps the rise/plateau at the high PRCP levels for both males and females is influenced by people who use Citibike out of necessity. This means that the initial decrease in trip duration as PRCP increases is logical, as people who can make their trips shorter will. However, beyond a certain point, the people who cannot adjust their travel will then be bringing up the overall average trip duration. Riders of unknown gender, who may not be regular users of Citibike, are likely casual bikers who will decrease their trip lengths as much as possible, and this is what the visualization depicts. It is curious that customers have an inconsistent correlation with PRCP values. Perhaps we can infer that some rain deters users from taking long trips, while there is a certain amount of rain that is considered pleasant; this certain amount can also be an amount where casual riders do not ride, and so only bikers who bike out of need are biking in the middle range. After this middle range, perhaps even those bikers begin having to compromise on their trip lengths. Biking speed may also fluctuate and be responsible for trip duration changes.

# Trips with tripduration below 10000; both plot groups below use this subset.
shortTrips <- masterdata[masterdata$tripduration < 10000, ]

# PRCP vs distanceH by gender: shared base plot, four geoms.
distanceByGender <- ggplot(
  shortTrips,
  aes(x = PRCP, y = distanceH, colour = gender)
)

distanceByGender + geom_point()

distanceByGender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

distanceByGender + geom_violin()

distanceByGender + geom_boxplot()

# PRCP vs distanceH by usertype: same four geoms, coloured by usertype.
distanceByUsertype <- ggplot(
  shortTrips,
  aes(x = PRCP, y = distanceH, colour = usertype)
)

distanceByUsertype + geom_point()

distanceByUsertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

distanceByUsertype + geom_violin()

distanceByUsertype + geom_boxplot()

Amongst unknown genders, PRCP is associated with a decrease in distance. For males and females, there seems to be a decrease in distance as PRCP increases to a certain level, after which the rate of decrease diminishes. For females, the distance then begins to increase, whereas for males it mostly plateaus. This, as seen previously, may be reflective of who is biking in these various PRCP ranges. In the middle range, we can infer that people try to minimize distance if they feasibly can. Perhaps as PRCP becomes drastic, only those with a need to bike will be out, and they may not be able to adjust the distance of their trip. The disparity between the male response and the female response here is curious. Customers, who are likely recreational/infrequent users, predictably decrease distance in correlation with increased PRCP. Subscribers reflect a response similar to that of the females mentioned previously.

# Monthly means of PRCP and tripduration, grouped by startMonthFactor, each
# plotted against the month index.
averagePRCPMonthly <- tapply(masterdata$PRCP, masterdata$startMonthFactor, mean)
plot(averagePRCPMonthly, xlab = "Month", ylab = "Average PRCP")

averageTripDurationMonthly <- tapply(
  masterdata$tripduration,
  masterdata$startMonthFactor,
  mean
)
plot(averageTripDurationMonthly, xlab = "Month", ylab = "Average Trip Duration")

# NOTE(review): the counts are grouped by startMonth while the averages above
# are grouped by startMonthFactor; the scatter plot further down pairs the two
# by position, so confirm both groupings list the months in the same order.
numTripsMonthly <- table(masterdata$startMonth)

# One point per month: average precipitation against average trip duration,
# then against the number of trips.
plot(
  x = averagePRCPMonthly, y = averageTripDurationMonthly,
  xlab = "Average PRCP", ylab = "Average Trip Duration"
)

plot(
  x = averagePRCPMonthly, y = numTripsMonthly,
  xlab = "Average PRCP", ylab = "Number of Trips"
)

# Raw scatter of every trip (no duration cut-off), one panel per month.
ggplot(data = masterdata, aes(x = PRCP, y = tripduration)) +
  geom_point() +
  facet_wrap(~ startMonthFactor)

# Trips with tripduration below 10000, used by all smooths below.
shortTrips <- masterdata[masterdata$tripduration < 10000, ]

# With the default smoother (gam, y ~ s(x, bs = "cs"), 10 knots) the
# month-faceted plots failed with "x has insufficient unique values to support
# 10 knots: reduce k", leaving those fits undrawn. The basis dimension is
# therefore lowered explicitly. k = 4 is a judgement call; lower it to 3 if a
# month still has too few distinct PRCP values.
monthlySmooth <- geom_smooth(
  method = "gam",
  formula = y ~ s(x, bs = "cs", k = 4)
)

ggplot(shortTrips, aes(x = PRCP, y = tripduration)) +
  monthlySmooth +
  facet_wrap(~ startMonthFactor)

ggplot(shortTrips, aes(x = PRCP, y = tripduration, colour = usertype)) +
  monthlySmooth +
  facet_wrap(~ startMonthFactor)

ggplot(shortTrips, aes(x = PRCP, y = tripduration, colour = gender)) +
  monthlySmooth +
  facet_wrap(~ startMonthFactor)

# Faceting by usertype / gender pools all months, so the default smoother
# fits without the knot problem and is left unchanged.
ggplot(shortTrips, aes(x = PRCP, y = tripduration, colour = gender)) +
  geom_smooth() +
  facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(shortTrips, aes(x = PRCP, y = tripduration, colour = usertype)) +
  geom_smooth() +
  facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

It does not appear that months with higher average PRCP correspond to lower average trip durations. This may be due to the rainier months also being warmer and more pleasant than the harsh winters of NY. Perhaps the pleasant days in rainy months are very positive for bikers in general, to the extent that they compensate for rainy days. We can see that, in different months, the amount of PRCP has varied correlations with trip duration. The winter months show little to no change in average trip duration as PRCP increases, which may reflect that bikers who ride during these times are not responsive to PRCP. Customers primarily decrease trip duration as PRCP increases, except in December and June, which may be months where tourists are determined to bike no matter the PRCP; subscribers vary greatly in their responses to PRCP in each month. Similar insights can be drawn when arranging the data by gender and usertype.

# Raw scatter of distanceH against PRCP for every trip (no duration cut-off),
# one panel per month.
ggplot(data = masterdata, aes(x = PRCP, y = distanceH)) +
  geom_point() +
  facet_wrap(~ startMonthFactor)

# Trips with tripduration below 10000, used by all smooths below.
shortTrips <- masterdata[masterdata$tripduration < 10000, ]

# With the default smoother (gam, y ~ s(x, bs = "cs"), 10 knots) the
# month-faceted plots failed with "x has insufficient unique values to support
# 10 knots: reduce k", leaving those fits undrawn. The basis dimension is
# therefore lowered explicitly. k = 4 is a judgement call; lower it to 3 if a
# month still has too few distinct PRCP values.
monthlySmooth <- geom_smooth(
  method = "gam",
  formula = y ~ s(x, bs = "cs", k = 4)
)

ggplot(shortTrips, aes(x = PRCP, y = distanceH)) +
  monthlySmooth +
  facet_wrap(~ startMonthFactor)

ggplot(shortTrips, aes(x = PRCP, y = distanceH, colour = usertype)) +
  monthlySmooth +
  facet_wrap(~ startMonthFactor)

ggplot(shortTrips, aes(x = PRCP, y = distanceH, colour = gender)) +
  monthlySmooth +
  facet_wrap(~ startMonthFactor)

# Faceting by usertype / gender pools all months, so the default smoother
# fits without the knot problem and is left unchanged.
ggplot(shortTrips, aes(x = PRCP, y = distanceH, colour = gender)) +
  geom_smooth() +
  facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(shortTrips, aes(x = PRCP, y = distanceH, colour = usertype)) +
  geom_smooth() +
  facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

In colder months and August, it seems that PRCP does not have a significant correlation with average distance. In other months, the correlation fluctuates or steadily yields lower distance as PRCP increases. Interestingly, April, which is traditionally a very rainy month, seems to have the greatest fluctuation in distance’s correlation with PRCP. Customers seem reliably unaffected by PRCP values in aggregate, except for a few interesting examples in August and May. Subscribers, again, vary greatly in their response, which may suggest that we must look into the behavioral trends of specific users to gain a full picture. While most insights from this data are fundamentally speculative, it is interesting to note the disparity in how females, males, and unknown genders vary in their response to PRCP when separated into usertypes. Female customers seem unbothered, while female subscribers decrease distances up until a certain point and then increase again (potentially due to only necessary rides being made, which are not responsive to PRCP changes). Male customers strongly decrease distance as PRCP increases, while male subscribers reflect a pattern similar to that of female subscribers (potentially due to the aforementioned insight). Similar insights are yielded by separating user types into genders.

speed

# Speed (speedMetersperMin) against PRCP for all trips: raw scatter first,
# then the overall smoothed trend.
speedVsPrcp <- ggplot(masterdata, aes(x = PRCP, y = speedMetersperMin))

speedVsPrcp + geom_point()

speedVsPrcp + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend split by gender.
ggplot(masterdata, aes(x = PRCP, y = speedMetersperMin, colour = gender)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend by usertype within each agegroup. With the default 10-knot
# GAM, two groups failed with "x has insufficient unique values to support 10
# knots: reduce k" and were not drawn, so the basis dimension is lowered
# explicitly. k = 4 is a judgement call; lower it to 3 if a group still has
# too few distinct PRCP values.
ggplot(masterdata, aes(x = PRCP, y = speedMetersperMin, colour = usertype)) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs", k = 4)) +
  facet_wrap(~ agegroup)

PRCP has a generally positive correlation with speed, which may indicate that bikers bike faster in rainier weather. It is important to note certain fluctuations in this correlation. The dip in speed around PRCP=1 may indicate that this amount of rain is particularly difficult to bike in, which causes bikers to slow down.